<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta charset="utf-8">
<title>Analysis and Analyzers | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="mapping-analysis.html" title="Mapping and Analysis">
<link rel="prev" href="inverted-index.html" title="Inverted Index">
<link rel="next" href="mapping-intro.html" title="Mapping">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <!-- Create the data layer array before the GTM container script is requested. -->
    <script>dataLayer = [];</script>
    <!-- No-JavaScript fallback: requests the GTM container through a hidden iframe. -->
    <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>
      // GTM container loader.
      //   w, d - window and document
      //   s    - tag name used both to find the first existing script element
      //          and to create the new one ('script')
      //   l    - name of the data layer global on w ('dataLayer')
      //   i    - container ID appended to the gtm.js URL
      (function (w, d, s, l, i) {
        // Make sure the data layer exists, then record the start event.
        w[l] = w[l] || [];
        w[l].push({ 'gtm.start': new Date().getTime(), event: 'gtm.js' });

        var f = d.getElementsByTagName(s)[0],
            j = d.createElement(s),
            // Only add the &l= parameter when a non-default data layer name is used.
            dl = l != 'dataLayer' ? '&l=' + l : '';

        j.async = true;
        // Explicit https:// rather than a protocol-relative URL, so the request
        // does not depend on the scheme the page itself was loaded with.
        j.src = 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;

        // Insert the loader ahead of the first script element in the document.
        f.parentNode.insertBefore(j, f);
      })(window, document, 'script', 'dataLayer', 'GTM-58RLH5');
    </script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the data layer array if one already exists on window; otherwise
      // start with an empty one.
      window.dataLayer = window.dataLayer || [];
      // gtag() queues a call by pushing its raw `arguments` object onto dataLayer.
      function gtag(){dataLayer.push(arguments);}
      // Queue a 'js' entry carrying the current timestamp.
      gtag('js', new Date());
      // Queue a 'config' entry for ID UA-12395217-16.
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Qualtrics Site Intercept loader (vendor-minified; code left untouched).
      // The constructor `g` takes four arguments:
      //   e - a percentage value; check() compares it against 100
      //   h - a mode flag; check() distinguishes "v" and "r"
      //   f - name of the cookie used to persist state between page views
      //   g - URL of the script that go() injects
      (function(){var g=function(e,h,f,g){
      // get(a): scan document.cookie for a cookie named `a`; return its value,
      // or null when no such cookie is present.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a,c): write cookie `a` with value `c`, path "/", expiring 6048E5 ms
      // (7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): decide whether the intercept script should be loaded.
      // State lives in cookie `f` as three ":"-separated fields (mode, percentage,
      // counter). When the cookie is missing and e is 100, this returns true
      // without writing a cookie. When the cookie is missing and e is not 100,
      // the state [h, e, 0] is written first (in "v" mode e is first replaced by
      // 0 or 100 based on Math.random()). After that: a stored percentage of 100
      // returns true; mode "v" returns false; mode "r" increments the counter,
      // rewrites the cookie, and returns true only when the previous counter is
      // a multiple of floor(100 / percentage). Any other mode returns true.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() returns true, create a script element pointing at URL
      // `g` and append it to document.body (skipped if document.body is not set).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): run go() on the window "load" event, using addEventListener when
      // available and falling back to attachEvent("onload", ...).
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Instantiate with e=100 and mode "r", then start. Any exception thrown
      // while constructing or starting is caught and discarded.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="getting-started.html">Getting Started</a></span>
»
<span class="breadcrumb-link"><a href="mapping-analysis.html">Mapping and Analysis</a></span>
»
<span class="breadcrumb-node">Analysis and Analyzers</span>
</div>
<div class="navheader">
<span class="prev">
<a href="inverted-index.html">« Inverted Index</a>
</span>
<span class="next">
<a href="mapping-intro.html">Mapping »</a>
</span>
</div>
<div class="section pagebreak-before">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="analysis-intro"></a>Analysis and Analyzers<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/052_Mapping_Analysis/40_Analysis.asciidoc">edit</a>
</h2>
</div></div></div>
<p><em>Analysis</em> is a process that consists of the following:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
First, tokenizing a block of text into
individual <em>terms</em> suitable for use in an inverted index,
</li>
<li class="listitem">
Then normalizing these terms into a standard form to improve their
“searchability,” or <em>recall</em>
</li>
</ul>
</div>
<p>This job is performed by analyzers. An <em>analyzer</em> is really just a wrapper
that combines three functions into a single package:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
Character filters
</span>
</dt>
<dd>
First, the string is passed through any <em>character filters</em> in turn. Their
job is to tidy up the string before tokenization. A character filter could
be used to strip out HTML, or to convert <code class="literal">&amp;</code> characters to the word
<code class="literal">and</code>.
</dd>
<dt>
<span class="term">
Tokenizer
</span>
</dt>
<dd>
Next, the string is tokenized into individual terms by a <em>tokenizer</em>. A
simple tokenizer might split the text into terms whenever it encounters
whitespace or punctuation.
</dd>
<dt>
<span class="term">
Token filters
</span>
</dt>
<dd>
Last, each term is passed through any <em>token filters</em> in turn, which can
change terms (for example, lowercasing <code class="literal">Quick</code>), remove terms (for example, stopwords such as
<code class="literal">a</code>, <code class="literal">and</code>, <code class="literal">the</code>) or add terms (for example, synonyms like <code class="literal">jump</code> and
<code class="literal">leap</code>).
</dd>
</dl>
</div>
<p>Elasticsearch provides many character filters, tokenizers, and token filters
out of the box. These can be combined to create custom analyzers suitable
for different purposes. We discuss these in detail in <a class="xref" href="custom-analyzers.html" title="Custom Analyzers">Custom Analyzers</a>.</p>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_built_in_analyzers"></a>Built-in Analyzers<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/052_Mapping_Analysis/40_Analysis.asciidoc">edit</a>
</h3>
</div></div></div>
<p>However, Elasticsearch also ships with prepackaged analyzers that
you can use directly. We list the most important ones next and, to demonstrate
the difference in behavior, we show what terms each would produce
from this string:</p>
<pre class="literallayout">"Set the shape to semi-transparent by calling set_trans(5)"</pre>

<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
Standard analyzer
</span>
</dt>
<dd>
<p>
The standard analyzer is the default analyzer that Elasticsearch uses. It is
the best general choice for analyzing text that may be in any language. It
splits the text on <em>word boundaries</em>, as defined by the
<a href="https://www.unicode.org/reports/tr29/" class="ulink" target="_top">Unicode Consortium</a>, and removes most
punctuation. Finally, it lowercases all terms. It would produce
</p>
<pre class="literallayout">set, the, shape, to, semi, transparent, by, calling, set_trans, 5</pre>

</dd>
<dt>
<span class="term">
Simple analyzer
</span>
</dt>
<dd>
<p>
The simple analyzer splits the text on anything that isn’t a letter,
and lowercases the terms. It would produce
</p>
<pre class="literallayout">set, the, shape, to, semi, transparent, by, calling, set, trans</pre>

</dd>
<dt>
<span class="term">
Whitespace analyzer
</span>
</dt>
<dd>
<p>
The whitespace analyzer splits the text on whitespace. It doesn’t
lowercase. It would produce
</p>
<pre class="literallayout">Set, the, shape, to, semi-transparent, by, calling, set_trans(5)</pre>

</dd>
<dt>
<span class="term">
Language analyzers
</span>
</dt>
<dd>
<p>
Language-specific analyzers are available for <a href="/guide/en/elasticsearch/reference/2.4/analysis-lang-analyzer.html" class="ulink" target="_top">many languages</a>. They are able to
take the peculiarities of the specified language into account. For instance,
the <code class="literal">english</code> analyzer comes with a set of English stopwords (common words
like <code class="literal">and</code> or <code class="literal">the</code> that don’t have much impact on relevance), which it
removes. This analyzer is also able to <em>stem</em> English words because it understands the
rules of English grammar.
</p>
<p>The <code class="literal">english</code> analyzer would produce the following:</p>
<pre class="literallayout">set, shape, semi, transpar, call, set_tran, 5</pre>

<p>Note how <code class="literal">transparent</code>, <code class="literal">calling</code>, and <code class="literal">set_trans</code> have been stemmed to
their root form.</p>
</dd>
</dl>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_when_analyzers_are_used"></a>When Analyzers Are Used<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/052_Mapping_Analysis/40_Analysis.asciidoc">edit</a>
</h3>
</div></div></div>
<p>When we <em>index</em> a document, its full-text fields are analyzed into terms that
are used to create the inverted index.  However, when we <em>search</em> on a full-text field,  we need to pass the query string through the <em>same analysis
process</em>, to ensure that we are searching for terms in the same form as those
that exist in the index.</p>
<p>Full-text queries, which we discuss later, understand how each field is
defined, and so they can do the right thing:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
When you query a <em>full-text</em> field, the query will apply the same analyzer
to the query string to produce the correct list of terms to search for.
</li>
<li class="listitem">
When you query an <em>exact-value</em> field, the query will not analyze the
query string, but instead search for the exact value that you have
specified.
</li>
</ul>
</div>
<p>Now you can understand why the queries that we demonstrated at the
<a class="xref" href="mapping-analysis.html" title="Mapping and Analysis">start of this chapter</a> return what they do:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
The <code class="literal">date</code> field contains an exact value: the single term <code class="literal">2014-09-15</code>.
</li>
<li class="listitem">
The <code class="literal">_all</code> field is a full-text field, so the analysis process has
converted the date into the three terms: <code class="literal">2014</code>, <code class="literal">09</code>, and <code class="literal">15</code>.
</li>
</ul>
</div>
<p>When we query the <code class="literal">_all</code> field for <code class="literal">2014</code>, it matches all 12 tweets, because
all of them contain the term <code class="literal">2014</code>:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /_search?q=2014              # 12 results</pre>
</div>
<div class="sense_widget" data-snippet="snippets/052_Mapping_Analysis/25_Data_type_differences.json"></div>
<p>When we query the <code class="literal">_all</code> field for <code class="literal">2014-09-15</code>, it first analyzes the query
string to produce a query that matches <em>any</em> of the terms <code class="literal">2014</code>, <code class="literal">09</code>, or
<code class="literal">15</code>. This also matches all 12 tweets, because all of them contain the term
<code class="literal">2014</code>:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /_search?q=2014-09-15        # 12 results !</pre>
</div>
<div class="sense_widget" data-snippet="snippets/052_Mapping_Analysis/25_Data_type_differences.json"></div>
<p>When we query the <code class="literal">date</code> field for <code class="literal">2014-09-15</code>, it looks for that <em>exact</em>
date, and finds one tweet only:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /_search?q=date:2014-09-15   # 1  result</pre>
</div>
<div class="sense_widget" data-snippet="snippets/052_Mapping_Analysis/25_Data_type_differences.json"></div>
<p>When we query the <code class="literal">date</code> field for <code class="literal">2014</code>, it finds no documents
because none contain that exact date:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /_search?q=date:2014         # 0  results !</pre>
</div>
<div class="sense_widget" data-snippet="snippets/052_Mapping_Analysis/25_Data_type_differences.json"></div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="analyze-api"></a>Testing Analyzers<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/052_Mapping_Analysis/40_Analysis.asciidoc">edit</a>
</h3>
</div></div></div>
<p>Especially when you are new to Elasticsearch, it is sometimes difficult to
understand what is actually being tokenized and stored into your index.  To
better understand what is going on, you can use the <code class="literal">analyze</code> API to see how
text is analyzed. Specify both the analyzer to use and the text to analyze in
the request body:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /_analyze
{
  "analyzer": "standard",
  "text": "Text to analyze"
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/052_Mapping_Analysis/40_Analyze.json"></div>
<p>Each element in the result represents a single term:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">{
   "tokens": [
      {
         "token":        "text",
         "start_offset": 0,
         "end_offset":   4,
         "type":         "&lt;ALPHANUM&gt;",
         "position":     1
      },
      {
         "token":        "to",
         "start_offset": 5,
         "end_offset":   7,
         "type":         "&lt;ALPHANUM&gt;",
         "position":     2
      },
      {
         "token":        "analyze",
         "start_offset": 8,
         "end_offset":   15,
         "type":         "&lt;ALPHANUM&gt;",
         "position":     3
      }
   ]
}</pre>
</div>
<p>The <code class="literal">token</code> is the actual term that will be stored in the index. The
<code class="literal">position</code> indicates the order in which the terms appeared in the original
text. The <code class="literal">start_offset</code> and <code class="literal">end_offset</code> indicate the character positions
that the original word occupied in the original string.</p>
<div class="tip admon">
<div class="icon"></div>
<div class="admon_content">
<p>The <code class="literal">type</code> values like <code class="literal">&lt;ALPHANUM&gt;</code> vary per analyzer and can be ignored.
The only place that they are used in Elasticsearch is in the
<a href="/guide/en/elasticsearch/reference/2.4/analysis-keep-types-tokenfilter.html" class="ulink" target="_top"><code class="literal">keep_types</code> token filter</a>.</p>
</div>
</div>
<p>The <code class="literal">analyze</code> API is a useful tool for understanding what is happening
inside Elasticsearch indices, and we will talk more about it as we progress.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_specifying_analyzers"></a>Specifying Analyzers<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/052_Mapping_Analysis/40_Analysis.asciidoc">edit</a>
</h3>
</div></div></div>
<p>When Elasticsearch detects a new string field in your documents, it
automatically configures it as a full-text <code class="literal">string</code> field and analyzes it with
the <code class="literal">standard</code> analyzer.</p>
<p>You don’t always want this. Perhaps you want to apply a different analyzer
that suits the language your data is in. And sometimes you want a
string field to be just a string field—​to index the exact value that
you pass in, without any analysis, such as a string user ID or an
internal status field or tag.</p>
<p>To achieve this, we have to configure these fields manually
by specifying the mapping.</p>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="inverted-index.html">« Inverted Index</a>
</span>
<span class="next">
<a href="mapping-intro.html">Mapping »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  // Expose an empty initial_state object on window.
  // NOTE(review): presumably read by /guide/static/docs.js — confirm.
  window.initial_state = {}</script>
  </body>
</html>
